
**prepare data for 2023 applicants
*****************HIGH SCHOOL BASIC INFORMATION**********************************
/*this file extracts data for high school information from Ministry of Education's webpage 
The rawdata can be downloaded from the following website:
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fdetail%2F111%2F111_base2.xlsx&wdOrigin=BROWSELINK
*/
clear
**import data for high school information from the first worksheet in 111_base2.xls
import excel "111_base2.xlsx", sheet("111_base2") cellrange(A3:AG3542) firstrow
drop if 學程等級名稱=="附設國中部" /*drop affiliated junior high schools*/ 

*calculate number of academic track senior students by gender by high schools 
preserve 
keep if 群別名稱=="學術群" /*keep only academic track students*/
collapse (sum) academicM3=三年級男 academicF3=三年級女 (first) 學校名稱2=學校名稱 , by(學校代碼)
/*collapse data to obtain no. of senior students by gender*/
sort  學校代碼
label var academicM3 "no. of academic-track male senior student in R's high school"
label var academicF3 "no. of academic-track female senior student in R's high school'"
save "academic111.dta", replace
restore 

*calculate total number of senior students (academic + vocation track) by gender by schools
preserve
collapse (sum) HM3=三年級男 HF3=三年級女 (first) 學校名稱3=學校名稱, by(學校代碼)
save "highsch111.dta", replace
restore 

/**imporat population data from Ministry of Interior, Department of Household Registration, M.O.I. 
the rawdata can be downloaded from the following website
https://www.ris.gov.tw/app/portal/346
the original file "鄉鎮市區人口數按性別及年齡分(96-111).xls" is renamed to "pop_gender_age111.xls"
compute population aged 17-18 in each administrative district (township, town, city, or district)
*/
clear
import excel  using  "pop_gender_age111.xls", sheet("111") cellrange(A6:DC790) firstrow clear allstring
keep gender areacode areaname city cityarea age_tot age17 age18  
sort cityarea
destring age_tot age17 age18, replace
*save "e:/NTU/work/age1718_Y112.dta", replace
collapse (sum) allpop=age_tot all17=age17 all18 = age18 (first) areacode city, by(cityarea)
clonevar areacode1 = areacode
destring areacode1, replace 
gen test = mod(areacode1, 1000)
drop if test == 0  /*drop the cases contained the subtotal in the city*/
drop test areacode1 
sort cityarea
save "all1718Y112.dta", replace

**import data for high school address and zipcode
/*the rawdata can be downloaded from the website of Ministry of Education
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fschool%2F111%2Fhigh.xls&wdOrigin=BROWSELINK
*/
*import the file from excel spreadsheet and and save the file as 111high_addr.dta 
clear 
import excel "111_high.xlsx", sheet("bc31") cellrange(A3:H519) firstrow allstring 
sort 學校代碼
save  "111high_addr.dta", replace

**merge high school address and zipcode (111high_addr.dta) to high school student data (high111.dta)
use "highsch111.dta", clear /*read in high school student data file*/
sort 學校代碼
merge 1:1 學校代碼 using 111high_addr.dta /*use school code to merge with high school address data*/
list	學校名稱	學校名稱3	學校代碼  就讀高中	_merge	if	_merge	<3 /*13  schools are missing*/
replace	 就讀高中=學校名稱3 if	_merge	==1 /*replace missing high school names*/
drop _merge
sort 學校代碼

**merge with number of academic track senior students
merge 1:1 學校代碼 using academic111.dta
drop 學校名稱2 
drop _merge
replace academicM3 = 0 if HM3~=. & academicM3==.
replace academicF3 = 0 if HF3~=. & academicF3==.
sort 就讀高中

*calculate total number of senior students (male+female) 
gen tot3=HM3+HF3
label var tot3 "total number of senior students in R's high school"
*calculate percentage of male student in high school
gen malep3=HM3/tot3
label var malep3 "% of male senior students in R's high school"

*creat categorical indicator for size of high school 
recode tot3 (0/99=1)(100/199=2)(200/299=3)(300/399=4)(400/499=5)(500/599=6)(600/699=7)(700/799=8)(800/899=9)(900/999=10)(1000/10000=11), gen(sizecat)
label define sizecat 1 "<100" 2 "100-199" 3 "200-299" 4 "300-399" 5 "400-499" 6 "500-599" 7 "600-699" 8 "700-799" 9 "800-899" 10 "900-999" 11 "1000+"
label value sizecat sizecat
label var sizecat "number of students in R's high school (dummy)"

**create percentage of academic track in high school 
gen academic3=academicM3+academicF3
label var academic3 "total no. of senior academic track students"
gen academicp =academic3/tot3
label var academicp "percentage of senior academic track students"
sort 學校代碼
save  "hsch_info111.dta", replace
************END OF HIGH SCHOOL BASIC INFORMATION********************************

***READ HOUSING PRICE INFORMATION***********************************************
/*the housing price information is taken from the following website https://price.houseprice.tw/list/
the python program to extract housing price information is stored in Replicatoin_Addr2HousePrice.zip
we provide a query dataset with 10 psudo address to test run the program */
*read housing price information
clear
import	excel	"address112.xlsx",	sheet("工作表1")	cellrange(A1:E21957)	firstrow   /*this file contained all applicant's street address, applicants who entered 2nd round were duplicated twice*/
sort addrm
merge m:1 addrm using "house_price_result112.dta"
drop id - addrm_kw _merge
sort dsource No 
save  "house_price112.dta", replace

***READ 2023 NTU applicants data **************************************************
*the data of student applicants are stored in three different worksheets
*the following section read the three worksheets and combined them into one file, the three worksheets contained information on: (1) those who failed the first round selection (2) those who passed 1st round but did not participate the 2nd round selection (3) those who entered the 2nd round the data 
**(1) read applicants who did not pass the first round selection
set more off
clear
import excel "applicant112.xlsx", sheet("112-未通過一階") cellrange(A1:Y11746) firstrow 
*destring  郵遞區號 , force replace 
*destring 學測應試號碼, force replace
gen dsource = 1
save "fail112.dta", replace

clear
**(2)read student applicants who passed the first round but did not enter the second round 
import excel "applicant112.xlsx", sheet("112-進二階+但未繳費") cellrange(A1:AO1444) firstrow  
gen dsource = 2
save "gaveup112.dta", replace
**append data  (1)+(2) N=11745 + 1443 = 13188
append using "fail112.dta"
replace 考生身份 = strtrim(考生身份)
replace 是否低收入戶 = strtrim(是否低收入戶)
save "notin2nd112.dta", replace 

*(3) read applicants who passed both 1st and 2nd round 
clear
*read 4384 students who passed the first round
import excel "applicant112.xlsx", sheet("112_進二階") cellrange(A1:BX4385) firstrow 
gen dsource = 3 
**append those who are not in second round*case number == 11745+1445+4383 =17572
append using "notin2nd112.dta"

label define dsource 1 "failed 1st round" 2 "gave up 2nd round" 3 "in 2nd round"
label value dsource dsource

drop if 類別 =="希望組" /*drop N=214 students from financially disadvantaged family who are on 'Hope Scholarship' and did not compete with other domestic undergraduate student* = 17572-214 = 17358*/


*replace 學系組名稱 = "國際體育運動事務學士學位學程" if 學系組名稱=="國際體育運動事務學程/共同教育中心"
 
clonevar 就讀高中 = s_edu_name /*make a copy of the original high school name*/
destring 校系代碼, force replace

*create label for 2023 department code, the label only apply to 2023 data
#delimit ; 
label define dept_l
1012	"中國文學系"
1022	"外國語文學系"
1032	"歷史學系"
1042	"哲學系"
1052	"人類學系"
1062	"圖書資訊學系"
1072	"日本語文學系"
1082	"戲劇學系(女)"
1092	"數學系"
1102	"物理學系"
1112	"化學系"
1122	"地質科學系"
1132	"心理學系"
1142	"地理環境資源學系"
1152	"大氣科學系"
1162	"政治學系政治理論組"
1172	"政治學系國際關係組"
1182	"政治學系公共行政組"
1192	"經濟學系A組"
1202	"經濟學系B組"
1212	"經濟學系C組"
1222	"社會學系"
1232	"社會工作學系"
1242	"醫學系"
1252	"醫學系(公費生)"
1262	"牙醫學系"
1272	"藥學系"
1282	"醫學檢驗暨生物技術學系"
1292	"護理學系"
1302	"物理治療學系"
1312	"職能治療學系"
1322	"土木工程學系"
1332	"機械工程學系"
1342	"化學工程學系"
1352	"工程科學及海洋工程學系"
1362	"材料科學與工程學系"
1372	"醫學工程學系"
1382	"農藝學系"
1392	"生物環境系統工程學系"
1402	"農業化學系"
1412	"森林環境暨資源學系"
1422	"動物科學技術學系"
1432	"農業經濟學系"
1442	"園藝暨景觀學系"
1452	"獸醫學系"
1462	"生物產業傳播暨發展學系"
1472	"生物機電工程學系"
1482	"昆蟲學系"
1492	"植物病理與微生物學系"
1502	"工商管理學系企業管理組"
1512	"工商管理學系科技管理組"
1522	"會計學系"
1532	"財務金融學系"
1542	"國際企業學系"
1552	"資訊管理學系"
1562	"資訊管理學系(資安組)"
1572	"公共衛生學系"
1582	"電機工程學系"
1592	"資訊工程學系"
1602	"資訊工程學系(APCS組)"
1612	"資訊工程學系(資安組)"
1622	"法律學系法學組"
1632	"法律學系司法組"
1642	"法律學系財經法學組"
1652	"生命科學系"
1662	"生化科技學系"
1672	"國際體育運動事務學士學位學程"; 
label value 校系代碼 dept_l;
#delimit cr

*replace department names missing from department codes for applicants failed 1st round
*note the department codes are not consistent between 2022 2023 data due to newly founded department
replace 學系組名稱=	"中國文學系"				if 校系代碼 ==		1012	&	學系組名稱 ==	""
replace 學系組名稱=	"外國語文學系"				if 校系代碼 ==		1022	&	學系組名稱 ==	""
replace 學系組名稱=	"歷史學系"					if 校系代碼 ==		1032	&	學系組名稱 ==	""
replace 學系組名稱=	"哲學系"					if 校系代碼 ==		1042	&	學系組名稱 ==	""
replace 學系組名稱=	"人類學系"					if 校系代碼 ==		1052	&	學系組名稱 ==	""
replace 學系組名稱=	"圖書資訊學系"				if 校系代碼 ==		1062	&	學系組名稱 ==	""
replace 學系組名稱=	"日本語文學系"				if 校系代碼 ==		1072	&	學系組名稱 ==	""
replace 學系組名稱=	"戲劇學系(女)"				if 校系代碼 ==		1082	&	學系組名稱 ==	""
replace 學系組名稱=	"數學系"					if 校系代碼 ==		1092	&	學系組名稱 ==	""
replace 學系組名稱=	"物理學系"					if 校系代碼 ==		1102	&	學系組名稱 ==	""
replace 學系組名稱=	"化學系"					if 校系代碼 ==		1112	&	學系組名稱 ==	""
replace 學系組名稱=	"地質科學系"				if 校系代碼 ==		1122	&	學系組名稱 ==	""
replace 學系組名稱=	"心理學系"					if 校系代碼 ==		1132	&	學系組名稱 ==	""
replace 學系組名稱=	"地理環境資源學系"			if 校系代碼 ==		1142	&	學系組名稱 ==	""
replace 學系組名稱=	"大氣科學系"				if 校系代碼 ==		1152	&	學系組名稱 ==	""
replace 學系組名稱=	"政治學系政治理論組"		if 校系代碼 ==		1162	&	學系組名稱 ==	""
replace 學系組名稱=	"政治學系國際關係組"		if 校系代碼 ==		1172	&	學系組名稱 ==	""
replace 學系組名稱=	"政治學系公共行政組"		if 校系代碼 ==		1182	&	學系組名稱 ==	""
replace 學系組名稱=	"經濟學系A組"				if 校系代碼 ==		1192	&	學系組名稱 ==	""
replace 學系組名稱=	"經濟學系B組"				if 校系代碼 ==		1202	&	學系組名稱 ==	""
replace 學系組名稱=	"經濟學系C組"				if 校系代碼 ==		1212	&	學系組名稱 ==	""
replace 學系組名稱=	"社會學系"					if 校系代碼 ==		1222	&	學系組名稱 ==	""
replace 學系組名稱=	"社會工作學系"				if 校系代碼 ==		1232	&	學系組名稱 ==	""
replace 學系組名稱=	"醫學系"					if 校系代碼 ==		1242	&	學系組名稱 ==	""
replace 學系組名稱=	"醫學系(公費生)"			if 校系代碼 ==		1252	&	學系組名稱 ==	""
replace 學系組名稱=	"牙醫學系"					if 校系代碼 ==		1262	&	學系組名稱 ==	""
replace 學系組名稱=	"藥學系"					if 校系代碼 ==		1272	&	學系組名稱 ==	""
replace 學系組名稱=	"醫學檢驗暨生物技術學系"	if 校系代碼 ==		1282	&	學系組名稱 ==	""
replace 學系組名稱=	"護理學系"					if 校系代碼 ==		1292	&	學系組名稱 ==	""
replace 學系組名稱=	"物理治療學系"				if 校系代碼 ==		1302	&	學系組名稱 ==	""
replace 學系組名稱=	"職能治療學系"				if 校系代碼 ==		1312	&	學系組名稱 ==	""
replace 學系組名稱=	"土木工程學系"				if 校系代碼 ==		1322	&	學系組名稱 ==	""
replace 學系組名稱=	"機械工程學系"				if 校系代碼 ==		1332	&	學系組名稱 ==	""
replace 學系組名稱=	"化學工程學系"				if 校系代碼 ==		1342	&	學系組名稱 ==	""
replace 學系組名稱=	"工程科學及海洋工程學系"	if 校系代碼 ==		1352	&	學系組名稱 ==	""
replace 學系組名稱=	"材料科學與工程學系"		if 校系代碼 ==		1362	&	學系組名稱 ==	""
replace 學系組名稱=	"醫學工程學系"				if 校系代碼 ==		1372	&	學系組名稱 ==	""
replace 學系組名稱=	"農藝學系"					if 校系代碼 ==		1382	&	學系組名稱 ==	""
replace 學系組名稱=	"生物環境系統工程學系"		if 校系代碼 ==		1392	&	學系組名稱 ==	""
replace 學系組名稱=	"農業化學系"				if 校系代碼 ==		1402	&	學系組名稱 ==	""
replace 學系組名稱=	"森林環境暨資源學系"		if 校系代碼 ==		1412	&	學系組名稱 ==	""
replace 學系組名稱=	"動物科學技術學系"			if 校系代碼 ==		1422	&	學系組名稱 ==	""
replace 學系組名稱=	"農業經濟學系"				if 校系代碼 ==		1432	&	學系組名稱 ==	""
replace 學系組名稱=	"園藝暨景觀學系"			if 校系代碼 ==		1442	&	學系組名稱 ==	""
replace 學系組名稱=	"獸醫學系"					if 校系代碼 ==		1452	&	學系組名稱 ==	""
replace 學系組名稱=	"生物產業傳播暨發展學系"	if 校系代碼 ==		1462	&	學系組名稱 ==	""
replace 學系組名稱=	"生物機電工程學系"			if 校系代碼 ==		1472	&	學系組名稱 ==	""
replace 學系組名稱=	"昆蟲學系"					if 校系代碼 ==		1482	&	學系組名稱 ==	""
replace 學系組名稱=	"植物病理與微生物學系"		if 校系代碼 ==		1492	&	學系組名稱 ==	""
replace 學系組名稱=	"工商管理學系企業管理組"	if 校系代碼 ==		1502	&	學系組名稱 ==	""
replace 學系組名稱=	"工商管理學系科技管理組"	if 校系代碼 ==		1512	&	學系組名稱 ==	""
replace 學系組名稱=	"會計學系"					if 校系代碼 ==		1522	&	學系組名稱 ==	""
replace 學系組名稱=	"財務金融學系"				if 校系代碼 ==		1532	&	學系組名稱 ==	""
replace 學系組名稱=	"國際企業學系"				if 校系代碼 ==		1542	&	學系組名稱 ==	""
replace 學系組名稱=	"資訊管理學系"				if 校系代碼 ==		1552	&	學系組名稱 ==	""
replace 學系組名稱=	"資訊管理學系(資安組)"		if 校系代碼 ==		1562	&	學系組名稱 ==	""
replace 學系組名稱=	"公共衛生學系"				if 校系代碼 ==		1572	&	學系組名稱 ==	""
replace 學系組名稱=	"電機工程學系"				if 校系代碼 ==		1582	&	學系組名稱 ==	""
replace 學系組名稱=	"資訊工程學系"				if 校系代碼 ==		1592	&	學系組名稱 ==	""
replace 學系組名稱=	"資訊工程學系(APCS組)"		if 校系代碼 ==		1602	&	學系組名稱 ==	""
replace 學系組名稱=	"資訊工程學系(資安組)"		if 校系代碼 ==		1612	&	學系組名稱 ==	""
replace 學系組名稱=	"法律學系法學組"			if 校系代碼 ==		1622	&	學系組名稱 ==	""
replace 學系組名稱=	"法律學系司法組"			if 校系代碼 ==		1632	&	學系組名稱 ==	""
replace 學系組名稱=	"法律學系財經法學組"		if 校系代碼 ==		1642	&	學系組名稱 ==	""
replace 學系組名稱=	"生命科學系"				if 校系代碼 ==		1652	&	學系組名稱 ==	""
replace 學系組名稱=	"生化科技學系"				if 校系代碼 ==		1662	&	學系組名稱 ==	""
replace 學系組名稱=	"國際體育運動事務學士學位學程"	if 校系代碼 ==		1672	&	學系組名稱 ==	""
replace 學系組名稱 = "戲劇學系(男)" 			if 學系組名稱 =="戲劇學系(女)" & 性別 == "男"

*generate college names from the department code
replace 學院 = "公共衛生學院" 	 	if 校系代碼 == 1572 & 學院 ==""    /* 2022--> 1562*/
replace 學院 = "共同教育中心"  		if 校系代碼 == 1672 & 學院 ==""    /* 2022--> 1652*/
replace 學院 = "工學院"  			if 校系代碼 >= 1322 & 校系代碼<=1372  & 學院 ==""
replace 學院 = "文學院"  			if 校系代碼 >= 1012 & 校系代碼<=1082  & 學院 ==""
replace 學院 = "法律學院"  			if 校系代碼 >= 1622 & 校系代碼<=1642  & 學院 =="" /*different 2022*/
replace 學院 = "理學院"  			if 校系代碼 >= 1092 & 校系代碼<=1152  & 學院 ==""
replace 學院 = "生命科學院"  		if 校系代碼 >= 1652 & 校系代碼<= 1662  & 學院 =="" /*different*/
replace 學院 = "生物資源暨農學院"  	if 校系代碼 >=  1382 & 校系代碼<=1492  & 學院 ==""
replace 學院 = "社會科學院"  		if 校系代碼 >=  1162 & 校系代碼<=1232  & 學院 ==""
replace 學院 = "管理學院"  			if 校系代碼 >=  1502 & 校系代碼<=1562  & 學院 =="" /*different*/
replace 學院 = "醫學院"  			if 校系代碼 >=  1242  & 校系代碼<=1312  & 學院 ==""
replace 學院 = "電機資訊學院"  		if 校系代碼 >=1582 & 校系代碼 <= 1612  & 學院 =="" /*different*/
 *create college codes
encode 學院, generate(college)

*create gender variable
encode 性別, generate(gender)
gen female = gender==1
rename s_number 學校代碼 /*rename school code variable*/

******correct high school names and codes 
replace 就讀高中 = "市立大同高中" if 就讀高中 == "私立大同高中"
replace 學校代碼 = "343302" if 學校代碼 == "341302"

replace 學校代碼 = "330301" if 就讀高中 == "國立師大附中" /*411 cases corrected*/
replace 學校代碼 = "900001" if 就讀高中 =="中正國防幹部預備學校" /*1 case assign pseudo idno*/
replace 學校代碼 = "400144" if 就讀高中 == "國立臺灣戲曲學院附設高職部" /*1 case corrected*/
replace 學校代碼 = "423301" if 就讀高中 ==  "市立復興高中" /*4 cases corrected*/
replace 學校代碼 = "014326" if 就讀高中 == "市立明德高中" /*2 cases corrected */
replace 學校代碼 = "193407" if 就讀高中 == "市立臺中高工" /* 1 cases corrected */
replace 學校代碼 = "134334" if 就讀高中 == "縣立來義高中" /* 1 cases corrected */

**add city names (as prefix) to school names to distinguish high schools with same names but located in different districts 
replace 就讀高中 = "臺北市立中正高中" if 學校代碼 =="423302"
replace 就讀高中 = "新北市立新莊高中" if 學校代碼 =="013337"
replace 就讀高中 = "高雄市立新莊高中" if 學校代碼 =="533302"
replace 就讀高中 = "桃園市立陽明高中" if 學校代碼 =="033325"

replace 學校代碼 ="0" if 學校代碼 ==""  /*assign code 0 to forign high school or alternative education*/

**merge high school information from hsch_info111.dta
merge m:1 學校代碼 using  "hsch_info111.dta"
*drop cases from using file
drop if _merge==2

********merge population of aged 17-18 from all1718.dta*************************
**extract the names of adminstrative district from student's household address
*manually correct the names of adminstrative district becuase of data parsing failure
clonevar cityarea = 戶籍地址區
replace cityarea =	"臺北市文山區"	if cityarea ==	"北市文山區興"
replace cityarea =	"臺中市南區"	if cityarea ==	"臺中市南區福"
replace cityarea =	"臺中市南區"	if cityarea ==	"臺中市南區懷"
replace cityarea =	"臺北市中正區"	if cityarea ==	"臺北市汀州路"
replace cityarea =	"臺北市信義區"	if cityarea ==	"臺北市忠孝東"
replace cityarea =	"臺北市信義區"	if cityarea ==	"臺北市信義路"
replace cityarea =	"臺北市南港區"	if cityarea ==	"臺北市南港福"
replace cityarea =	"臺東縣臺東市"	if cityarea ==	"台東市文心街"
replace cityarea =	"臺東縣臺東市"	if cityarea ==	"台東市志航路"
replace cityarea =	"臺北市文山區"	if cityarea ==	"指南路二段"
replace cityarea =	"高雄市路竹區"	if cityarea ==	"高雄市路" 
replace cityarea =	"新北市新店區"	if cityarea ==	"新北市新店中"
replace cityarea =	"新北市萬里區"	if cityarea ==	"新北市萬里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市八德路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市三民路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市大學路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市中山路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市中央路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市中正路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市中華路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市中興路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市公園路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市公道五"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市水田街"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市牛埔北"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市牛埔路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市世傑路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市北大路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區成"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區竹"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區延"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區東"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區武"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區金"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區湳"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北區境"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市北新街"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市四維路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市民主路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市民族路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市民富里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市民權路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市田美二"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市立鵬路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市光復路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市光華一"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市光華二"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市光華東"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市光華街"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市江山街"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市竹文街"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市竹光路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市竹村三"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市自由路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市西大路"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市育德街"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市和平路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市延平路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市忠孝路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市明湖路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市東大路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東門街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東南街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區八"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區三"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區大"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區仁"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區公"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區水"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區仙"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區仰"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區光"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區安"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區竹"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區明"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區東"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區金"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區南"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區建"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區科"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區軍"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區埔"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區柴"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區崇"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區湖"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區園"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區慈"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區新"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區福"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區綠"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區龍"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區豐"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區關"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市東區鐵"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市武陵路"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市虎林街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市金山二"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市金竹路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市金雅7"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市金雅里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市南大路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市建中一"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市建功里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市食品路"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市香北一"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市埔頂二"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市埔頂三"
replace cityarea =	"桃園市桃園區"	if cityarea ==	"新竹市桃園市"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市高翠路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市勝利路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市湖濱三"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市湳雅街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市園區一"
replace cityarea =	"新竹市香山區"	if cityarea ==	"新竹市新香街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市新莊街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市經國路"
replace cityarea =	"新竹市北區"	if cityarea ==	"新竹市境福里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市綠水里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市德成街"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市龍山東"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市關東路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市關新二"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市關新里"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市關新路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市寶山路"
replace cityarea =	"新竹市東區"	if cityarea ==	"新竹市鐵道路"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市大雅路"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市大溪里"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市大聖南"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市大聖路"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市五福街"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市文心街"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市文雅街"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市日新街"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市世賢路"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市民生南"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市圳頭里"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市安和街"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市安寮里"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市安樂街"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市竹圍里"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市西平里"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市西區劉"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東洋新"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東區文"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東區後"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東區興"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東區豐"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市東義"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市林森西"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市林森東"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市芳安路"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市垂楊路"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市姜文街"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市宣信街"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市湖美二"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市雅竹路"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市圓福街"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市新民路"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市新生路"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市義教東"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市劉厝里"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市興中街"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市興業西"
replace cityarea =	"嘉義市東區"	if cityarea ==	"嘉義市興業東"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市興達路"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市遼寧一"
replace cityarea =	"嘉義市西區"	if cityarea ==	"嘉義市蘭州四"
replace cityarea =	"臺中市中區"	if cityarea ==	"臺中市中區三"
replace cityarea =	"臺中市中區"	if cityarea ==	"臺中市中區中"
replace cityarea =	"臺中市中區"	if cityarea ==	"臺中市中區柳"
replace cityarea =	"臺北市士林區"	if cityarea ==	"臺北市士東路"
replace cityarea =	"臺南市中西區"	if cityarea ==	"臺南市中西"
**replace the word 台 with 臺 if any
replace 	cityarea=	"臺中市北屯區"	if cityarea==	"台中市北屯區"
replace 	cityarea=	"臺中市大里區"	if cityarea==	"台中市大里區"
replace 	cityarea=	"臺北市萬華區"	if cityarea==	"台北市萬華區"
replace 	cityarea=	"臺南市安南區"	if cityarea==	"台南市安南區"
replace 	cityarea=	"臺南市官田區"	if cityarea==	"台南市官田區"
replace 	cityarea=	"臺東縣卑南鄉"	if cityarea==	"台東縣卑南鄉"
replace 	cityarea=	"臺東縣臺東市"	if cityarea==	"台東縣台東市"
replace 	cityarea=	"臺東縣池上鄉"	if cityarea==	"台東縣池上鄉"
replace 	cityarea=	"臺東縣綠島鄉"	if cityarea==	"台東縣綠島鄉"
replace 	cityarea=	"嘉義市西區"	if cityarea==	"嘉義市新榮路"
replace 	cityarea=	"臺中市南屯區"	if cityarea==	"臺中市南屯"
**replace the missing district names based on the street names
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市世賢路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市保健街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市光彩街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市公明路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市博愛路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市博東路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市向榮街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市吳鳳南"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市國華街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市垂楊路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市大平街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市忠義街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市文雅街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市林森東"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市武昌街"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市永樂二"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市立仁路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市維和街"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市維新路"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市育人路"
replace cityarea=	"嘉義市東區"	if cityarea ==	"嘉義市蘭潭里"
replace cityarea=	"嘉義市西區"	if cityarea ==	"嘉義市西區文"
replace cityarea=	"屏東縣屏東市"	if cityarea ==	"屏東縣屏東縣"
replace cityarea=	"彰化縣員林市"	if cityarea ==	"彰化縣員林鎮"
replace cityarea=	"新北市八里區"	if cityarea ==	"新北市八里"
replace cityarea=	"新北市永和區"	if cityarea ==	"新北市永和保"
replace cityarea=	"新北市永和區"	if cityarea ==	"新北市永和郵"
replace cityarea=	"新北市萬里區"	if cityarea ==	"新北市萬里"
replace cityarea=	"新北市蘆洲區"	if cityarea ==	"新北市蘆洲玉"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市三民路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市中央路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市中正路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中清路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中華路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市中雅里"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市光復路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市光華二"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市公道五"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市北大路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市國光街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市園後街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市埔頂一"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市學府路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市安富街"
replace cityarea=	"新竹市香山區"	if cityarea ==	"新竹市富群街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市延平路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市建中一"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市建功二"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市建興街"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市忠孝路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市慈雲路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市明湖路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東區竹"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東南街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市東大路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市東山街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市東門街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市武陵路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市民權路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市民生路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市湳中街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市湳雅街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市磐石里"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市福德里"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市竹光路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市經國路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市西大路"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市西門街"
replace cityarea=	"新竹市北區"	if cityarea ==	"新竹市鐵道路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市關新路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市關東路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市食品路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市高峰路"
replace cityarea=	"新竹市東區"	if cityarea ==	"新竹市龍山東"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區柳"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區民"
replace cityarea=	"臺中市中區"	if cityarea ==	"臺中市中區自"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區大"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區英"
replace cityarea=	"臺中市北區"	if cityarea ==	"臺中市北區青"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區三"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區復"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區柳"
replace cityarea=	"臺中市南區"	if cityarea ==	"臺中市南區樹"
replace cityarea=	"臺中市東區"	if cityarea ==	"臺中市東區東"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區五"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區公"
replace cityarea=	"臺中市西區"	if cityarea ==	"臺中市西區昇"
replace cityarea=	"臺中市西屯區"	if cityarea ==	"臺中市西屯福"
replace cityarea=	"臺北市萬華區"	if cityarea ==	"臺北市西藏路"
replace cityarea=	"臺南市佳里區"	if cityarea ==	"臺南市佳里"
replace cityarea=	"花蓮縣玉里鎮"	if cityarea ==	"花蓮縣玉里"
replace cityarea=	"高雄市路竹區"	if cityarea ==	"高雄市路" 
replace cityarea=	"高雄市鼓山區"	if cityarea ==	"高雄市鼓山二"
replace cityarea = "花蓮市玉里鎮" if cityarea =="花蓮市玉里鄉"
replace cityarea = "桃園市中壢區" if cityarea == "桃園縣中壢區"
replace cityarea = "桃園市龜山區" if cityarea == "桃園縣龜山鄉"
replace cityarea = "臺北市內湖區" if cityarea == "臺北市內湖路"
replace cityarea = "臺東縣太麻里鄉" if cityarea == "臺東縣太麻里"
replace cityarea = "臺東縣金峰鄉" if cityarea == "臺東縣金鋒鄉"
replace cityarea = "臺北市文山區" if cityarea == "新北市文山區"
replace cityarea = "高雄市三民區" if 戶籍地址區 == "" & 通訊地址 =="高雄市三民區遼寧三街" /*replace 2 cases with 通訊地址*/
replace cityarea = "高雄市楠梓區" if 戶籍地址區 == "" & 通訊地址 =="高雄市楠梓區楠梓路" /*replace 1 case with 通訊地址*/
**destring id number
destring 學測應試號碼, force replace

**merge housing price from external file
drop _merge
**匯入房價資料
sort dsource No
merge 1:1 dsource No using "house_price112.dta"
tab dsource _merge 
drop if _merge==2 /*drop duplicated cases in the external file n=4598 = 36+4427+135: dsource =1 n=36 from hope program, dsource=3 n=135 hope, dsource=2 4384(in 2nd) +43 (hope)=4427*/
drop _merge

**compute average housing price in administrative district
preserve 
collapse  (sum) 抓取筆數 (mean) 單價平均 - 屋齡平均, by(cityarea) 
drop if cityarea==""
save "cityarea_hp112.dta" , replace
restore

**updatea missing housing price with average price in the adminstrative district
sort cityarea
merge m:1 cityarea using "cityarea_hp112.dta", update
drop _merge
**manually update the remaining missing cases with the nearest housing price
replace 單價平均 = 17.4 	if 戶籍地址 =="南投縣水里鄉南光村六合二街"
replace 單價中位數 = 17.4 	if 戶籍地址 =="南投縣水里鄉南光村六合二街"
replace 單價平均 = 19.1 	if 戶籍地址=="臺東縣金鋒鄉嘉蘭村"
replace 單價中位數 = 18.8 	if 戶籍地址=="臺東縣金鋒鄉嘉蘭村"
replace 單價平均 = 20 		if 戶籍地址=="苗栗縣後龍鎮北龍里6鄰車站街"
replace 單價中位數 = 20 	if 戶籍地址=="苗栗縣後龍鎮北龍里6鄰車站街"
replace 單價平均 = 17 		if 戶籍地址=="金門縣烏坵鄉小坵村"
replace 單價中位數 = 17 	if 戶籍地址=="金門縣烏坵鄉小坵村"
replace 單價平均 = 10 		if 戶籍地址=="高雄市六龜區寶來里中正路"
replace 單價中位數 = 10 	if 戶籍地址=="高雄市六龜區寶來里中正路"
replace 單價平均 = 5 		if 戶籍地址=="高雄市那瑪夏區達卡努瓦里"
replace 單價中位數 = 5 		if 戶籍地址=="高雄市那瑪夏區達卡努瓦里"

*merge the population aged 17-18 
merge m:1 cityarea using "all1718Y112.dta"
drop if _merge==2 /*drop cases only in using file*/

**calculate high school rank = 1st round acceptance rate by high school =
*(no. of students passed 1st round in R's high school)/(total no. of senior students in R's high school)
preserve
keep if dsource >=2  /*select those who passed 1st round*/
bysort 就讀高中: gen hsch_ntu = _N /*calucate no. of students passed 1st round by high school*/
label var hsch_ntu "就讀高中一階進台大人數"
collapse (mean) hsch_ntu, by(就讀高中)
save "hsch_ntu112.dta", replace
restore 

drop _merge
merge m:1 就讀高中 using hsch_ntu112.dta
recode hsch_ntu (.=0) if _merge==1 /*schools not in the using file have no students passed 1st round*/
gen hschrank = hsch_ntu/tot3 
label var hschrank "high school rank = % of senior students passed 1st round exam"


***create three-digit zip code 
gen cityarea2 = subinstr(cityarea, "鄉", "", 1)
replace cityarea2 =subinstr(cityarea2, "鎮", "", 1)
*replace cityarea2 =subinstr(cityarea2, "市", "", 1)
replace cityarea2 = "高雄市前鎮區" if cityarea2 == "高雄市前區"
replace cityarea2 = "桃園市平鎮區" if cityarea2 == "桃園市平區"
replace cityarea2 =	"南投縣南投"	if cityarea2 ==	"南投縣南投市"
replace cityarea2 =	"嘉義縣嘉義"	if cityarea2 ==	"嘉義市東區"
replace cityarea2 =	"嘉義縣嘉義"	if cityarea2 ==	"嘉義市西區"
replace cityarea2 =	"嘉義縣太保"	if cityarea2 ==	"嘉義縣太保市"
replace cityarea2 =	"嘉義縣朴子"	if cityarea2 ==	"嘉義縣朴子市"
replace cityarea2 =	"宜蘭縣宜蘭"	if cityarea2 ==	"宜蘭縣宜蘭市"
replace cityarea2 =	"屏東縣屏東"	if cityarea2 ==	"屏東縣屏東市"
replace cityarea2 =	"彰化縣員林"	if cityarea2 ==	"彰化縣員林市"
replace cityarea2 =	"彰化縣彰化"	if cityarea2 ==	"彰化縣彰化市"
replace cityarea2 =	"新竹縣新竹"	if cityarea2 ==	"新竹市北區"
replace cityarea2 =	"新竹縣新竹"	if cityarea2 ==	"新竹市東區"
replace cityarea2 =	"新竹縣新竹"	if cityarea2 ==	"新竹市香山區"
replace cityarea2 =	"新竹縣竹北"	if cityarea2 ==	"新竹縣竹北市"
replace cityarea2 =	"澎湖縣七美"	if cityarea2 ==	"澎湖縣七美"
replace cityarea2 =	"澎湖縣望安"	if cityarea2 ==	"澎湖縣望安"
replace cityarea2 =	"澎湖縣湖西"	if cityarea2 ==	"澎湖縣湖西"
replace cityarea2 =	"澎湖縣白沙"	if cityarea2 ==	"澎湖縣白沙"
replace cityarea2 =	"澎湖縣西嶼"	if cityarea2 ==	"澎湖縣西嶼"
replace cityarea2 =	"澎湖縣馬公"	if cityarea2 ==	"澎湖縣馬公市"
replace cityarea2 =	"臺東縣臺東"	if cityarea2 ==	"臺東縣臺東市"
replace cityarea2 =	"花蓮縣花蓮"	if cityarea2 ==	"花蓮縣花蓮市"
replace cityarea2 =	"苗栗縣苗栗"	if cityarea2 ==	"苗栗縣苗栗市"
replace cityarea2 =	"苗栗縣頭份"	if cityarea2 ==	"苗栗縣頭份市"
replace cityarea2 =	"雲林縣斗六"	if cityarea2 ==	"雲林縣斗六市"

do "convert hh_address to zipcode3.do"

gen housing = 單價中位數
label var housing "median housing price"

gen lnhousingp=ln(housing)
label var lnhousingp "log(median housing price)"

***combine the sub-field in the same department into one department name 
gen bname = 學系組名稱
replace bname=	"工商管理學系"	if bname==	"工商管理學系企業管理組"	
replace bname=	"工商管理學系"	if bname==	"工商管理學系科技管理組"	
replace bname=	"法律學系"	if bname==	"法律學系司法組"	
replace bname=	"法律學系"	if bname==	"法律學系法學組"	
replace bname=	"法律學系"	if bname==	"法律學系財經法學組"	
replace bname=	"政治學系"	if bname==	"政治學系公共行政組"	
replace bname=	"政治學系"	if bname==	"政治學系政治理論組"	
replace bname=	"政治學系"	if bname==	"政治學系國際關係組"	
replace bname=	"經濟學系"	if bname==	"經濟學系A組"	
replace bname=	"經濟學系"	if bname==	"經濟學系B組"	
replace bname=	"經濟學系"	if bname==	"經濟學系C組"	
replace bname=	"戲劇學系"	if bname==	"戲劇學系(女)"	
replace bname=	"戲劇學系"	if bname==	"戲劇學系(男)"	
replace bname=	"醫學系"	if bname==	"醫學系"	
replace bname=	"醫學系"	if bname==	"醫學系(公費生)"	 
replace bname=  "資訊工程學系" if bname=="資訊工程學系(APCS組)"	
replace bname=  "資訊工程學系" if bname=="資訊工程學系(資安組)"	
replace bname=  "資訊管理學系" if bname=="資訊管理學系(資安組)"	
sort bname
drop _merge
label var bname "department name"


/*the following two datasets were downloaded from Ministrey of Education's website:
https://depart.moe.edu.tw/ED4500/News_Content.aspx?n=5A930C32CC6C3818&sms=91B3AAE8C6388B96&s=4F9035F0AF08D527
*/

/*the data for faculty can be downloaded from the following link:
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fdetail%2F111%2F111_teachers.xlsx&wdOrigin=BROWSELINK
*/
***get number of faculty by departments
preserve
clear 
import excel "111_teachers.xlsx", sheet("teachers(總)") cellrange(A6:P4356) firstrow allstring 
keep if scode =="0003" /*keep only NTU data*/
drop scode sname dn
sort bname
save  "NTU_t111.dta", replace
restore 
***get number of students by departments
/*the data can be downloaded from the following MOE website:
https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fstats.moe.gov.tw%2Ffiles%2Fdetail%2F111%2F111_students.xlsx&wdOrigin=BROWSELINK
*/
preserve
clear 
import excel "111_students.xlsx", sheet("students") cellrange(A4:Y9779) firstrow allstring 
keep if scode =="0003"
keep if degree == "B 學士"
drop scode sname dn degree
sort bname
save  "NTU_s111.dta", replace
restore 

sort bname
**merge the students profile of the department 
merge m:1 bname using "NTU_s111.dta"
drop if _merge==2
drop _merge

**merge faculty profile of the department
merge m:1 bname using "NTU_t111.dta"
drop if _merge==2
drop _merge

destring stotal - lecturer_f , force replace
**calculate percentage of female students in the department
gen s_malep = mtot/stotal
gen s_femalep =ftot/stotal
label var s_malep "% of male students in the department"
label var s_femalep  "% of female students in the department"
***calculate percentage of fist year female students 
gen s1_malep = m1/(m1+f1)
gen s1_femalep =f1/(m1+f1)
label var s1_malep "% of male 1st yr students in the department"
label var s1_femalep  "% of female 1st yr students in the department"
*calculate the percentage of opposite sex in the department 

gen s_samesex = s_femalep if female==1
replace s_samesex = s_malep if female==0
label var s_samesex "% of same sex students in the department"

**calculate the percentage of male/female faculty in the department計算系所老師的性別比例
gen t_malep = ttot_m/ttotal
gen t_femalep = ttot_f/ttotal
label var t_malep "% of male faculty in the department"
label var t_femalep "% of female faculty in the department"
gen t_samesex = t_malep if female == 0
replace t_samesex = t_femalep if female==1
label var t_samesex "% of same sex faculty in the department"

**single sex high school- one sex exceeded 95%
gen allmale=malep3>.94
gen allfemale=malep3<.07
label var allmale "all male high school"
label var allfemale "all female high school"

***create variable for application outcomes
gen outcome = 3 	if 錄取別=="正取" & dsource==3
replace outcome = 2 if 錄取別=="備取" & dsource==3
replace outcome = 1 if 錄取別=="" & dsource==3
replace outcome = 0 if dsource==2
replace outcome = -1 if dsource==1

label var outcome "application outcomes"
label define outcome -1 "did not passed 1st round" 0 "not in 2nd rd" 1 "rejected" 2 "waiting" 3 "admitted" , replace
label value outcome outcome


***create indicator for minority and remote islander status
replace 考生身份 = strtrim(考生身份)
gen minority = 0 if  考生身份 =="一般考生" | 考生身份 =="一般生"
replace minority = 1 if 考生身份 =="原住民生" |  考生身份 =="原住民"  
replace minority = 2 if 考生身份 =="離島考生" | 考生身份 =="離島生" | 考生身份 == "願景計畫生"
label var minority "minority status of student"
label define minority 0 "regular student" 1 "aboriginal" 2 "remote islander"
label value minority minority

**create indicator of whether the respodnent atteded a public or private high school 
gen private =公私立別=="私立"
label var private "attended private high school==1"

*the following do file assign weights to various assessment components, the weights are taken from the 2023 NTU admission application booklet.
do "score weight 112.do"
label var testscorep "學測佔甄選總成績的%"
label var oralp "面試佔甄選總成績的%"
label var writtenp "筆試佔甄試總成績%"
label var reviewp "審查成績佔甄試總成績%"
label var written1 "筆試1佔甄試總成績%"
label var written2 "筆試2佔甄試總成績%"
label var written3 "筆試3佔甄試總成績%"

***create scores for GSAT, in-person interview, dossier review, written test 1-4, pages of dossier 
clonevar exam = 學測成績佔分
clonevar exam100= 學測成績
clonevar review =審查成績佔分
clonevar oral = 口試成績佔分
clonevar pen1 =筆試1
clonevar pen2 =筆試2
clonevar pen3 =筆試3
clonevar pen4 =筆試4
clonevar page =書審資料EP頁數
destring exam - page , force replace
recode page (.=0)  /*if not uploaded dossier, page=0*/
recode exam review oral pen1 - pen4 (-1=0)

label variable exam100 "Normalized GSAT score"
label variable exam "weighted GSAT score"
label variable review "weighted scores for dossier review"
label variable oral "weighted scores for in-person interview"
label variable pen1 "written test 1 score"
label variable pen2 "written test 2 score"
label variable pen3 "written test 3 score"
label variable pen4 "written test 4 score"
label variable page "pages of dossier"

*create variables for GSAT test score 
clonevar chinese = 學測國文級分	
clonevar english = 學測英文級分	
clonevar mathA = 學測數學A級分
clonevar mathB = 學測數學B級分
clonevar social = 學測社會級分
clonevar natural = 學測自然級分
label variable chinese "GSAT score for Chinese"
label variable english "GSAT score for English"
label variable mathA "GSAT score for Mathematics A"
label variable mathB "GSAT score for Mathematics B"
label variable social "GSAT score for social studies"
label variable natural "GSAT score for natural science"
**five departments use GSAT for screening purpose only, including departments of ocean engeneering, mathematics, dentistry, medicine, physics. We assign equal weight to each subject to calculate the normalized GSAT score.  
replace exam100 = [(english + mathA + natural)/45]*100 if (bname =="工程科學及海洋工程學系" | bname =="數學系")  & dsource==3
replace exam100 = [(chinese + english + mathA + natural)/60]*100 if (bname == "牙醫學系" | bname =="醫學系" | bname == "物理學系") & dsource ==3

*calculate the final score for the applicant:
*step 1: sum up weighted scores for GSAT, dossier review and in-person interview
*step 2: adjusted the scores by adding weighted scores from the written tests if written test is required by the department. The weight for written tests can be found in "application broshure 2023.pdf"
egen score = rowtotal(exam review oral)		if dsource ==3
replace score = score+pen1*0.2    			if bname=="歷史學系"  /*no written test 2022*/
replace score = score+pen1*0.2+pen2*0.15    if bname=="中國文學系"
replace score = score+pen1*0.45 			if bname=="動物科學技術學系"
replace score = score+pen1*0.30 			if bname=="化學工程學系"
replace score = score+pen1*0.25 			if bname=="化學系" 
replace score = score+pen1*0.30 			if bname=="土木工程學系"
replace score = score+pen1*0.20 			if bname== "外國語文學系"
replace score = score+pen1*0.25 			if bname=="大氣科學系"
replace score = score+pen1*0.55 	 		if bname=="數學系"
replace score = score+pen1*0.25 			if bname=="昆蟲學系"
replace score = score+pen1*0.25 			if bname=="材料科學與工程學系"
*replace score = score+pen1*0.15 			if bname=="植物病理與微生物學系" /*no written in 2023*/
replace score = score+pen1*0.30 			if bname=="機械工程學系"
replace score = score +pen1*.15+pen2*.2+((pen3+pen4)/2)*.25 if bname == "牙醫學系"
replace score = score+(pen1*.30+pen2*.25)	if bname=="物理學系" /* .32 .23 in 2022 year*/
replace score = score+pen1*0.25 			if bname=="生命科學系"
replace score = score+pen1*0.10+pen2*0.10	if bname=="藥學系" 
replace score = score+(pen1)*0.3			if 學系組名稱=="資訊工程學系" /*2022 == .4*/
replace score = score+(pen1)*0.2			if 學系組名稱=="資訊工程學系(APCS組)"  | 學系組名稱=="資訊工程學系(資安組)" /*資工分組*/
replace score = score+pen1*0.25 			if bname=="醫學工程學系"
replace score = score +((pen1+pen2)/2)*.1+((pen3+pen4)/2)*.4 if bname == "醫學系"
replace score = score+pen1*.15+pen2*.15		if bname=="電機工程學系"
replace score = score+(pen1)*0.5			if bname=="工程科學及海洋工程學系" 
sort  學系組名稱 outcome score 
label var score "甄選總成績"

**create rank within the department based on the final score
bysort  學系組名稱: egen rank= rank(score)  
label var rank "rank order within the department"
**create the percentile rank within the department by dividing the rank by the maximum rank in the department
bysort  學系組名稱: egen maxrank=max(rank) 
label var maxrank "全系排名最大值"
bysort  學系組名稱: gen rank100 = (rank/maxrank)*100
label var rank100 "Percentile rank within department"

**create indicator variable "penexam" to indicate whether the department require written exam
gen penexam =writtenp>0	 	
label var penexam "department has additional written exam"
***create indicator "oralexam" to indicate whether the departemnt require oral exam
gen oralexam = oralp >0 
label var oralexam "has oral exam"

**create types of exam required by the department
gen examform = 1 if oralexam==1 & penexam==0
replace examform =2 if oralexam==0 & penexam==1
replace examform =3 if oralexam==0 & penexam==0
replace examform =4 if oralexam==1 & penexam==1
label var examform "types of exam required"
label define examform 1 "oral=1, written=0" 2 "oral=0, written=1" 3 "oral=0, written=0"  4 "oral=1, written=1"
label value examform examform
*create indicator for students from remote island (kimman)
gen kimman=minority==2
*create indicator for students with aborigin family background
gen aborigin=minority==1
*create indicator for ronin status
destring 畢業年份, force replace
gen retake = 畢業年份<112
*create indicator for admiited, rejected, and on the waiting list
gen waiting=outcome==2 
gen admitted=outcome==3
gen rejected=outcome==1

label var kimman "student from remote island = 1"
label var aborigin "student with obriginal background = 1"
label var retake "ronin status = 1"
label var waiting "on waiting list = 1"
label var admitted "admitted = 1"
label var rejected "rejected =1 "

**create high school GPA
clonevar sch_pr = 五學期校平均相對表現
clonevar cluster_pr = 五學期群平均相對表現
clonevar class_pr = 五學期科班學程平均相對表現
clonevar hsch_gpa =  五學期平均成績
encode 科班學程別, generate(class)
**only exam108=1 has high school GPA and PR rank
encode  備審資料, generate(exam108)
destring hsch_gpa, force replace
clonevar id = 學測應試號碼
label variable id "application ID number"
label variable hsch_gpa "High school grade average"

**calculate no of students apply for NTU
gen accepted  = (統一分發後結果=="錄取" | 統一分發後結果 == "放棄")
label var accepted "the applicant was accepted into NTU"

***drop cases
drop if 學校代碼 == "" | 學校代碼=="0"  /*drop 35 students without high school information  total cases17358-35=17323*/
drop if 學系組名稱=="國際體育運動事務學士學位學程" /*drop 108 cases from the newly founded department 國際體育運動事務學士學位學程  17323-108 = 17215*/ 
drop if cityarea == ""  /*drop 68 cases with missing household addresss戶籍地址 17215-68=17147 */

/*the following syntax calculate the percentage of college graduates in each adminstrative district
the original dataset is available from Ministry of Interior's open data website, can be downloaded from:
https://data.gov.tw/dataset/117988
*/
preserve
clear
use "pop111.dta"  /*read in the original data from MOI*/
rename population pop
gen pop25=pop if age =="25~29歲" 
gen college = pop if (edu == "大畢" |  edu=="碩畢" | edu=="博畢")
gen college25 = pop if age =="25~29歲"  & (edu == "大畢" |  edu=="碩畢" | edu=="博畢")
collapse (sum) pop pop25 college college25, by(site_id)
gen zipcollegep = college/pop
gen zipcollegep25 = college25/pop25
rename site_id cityarea
keep cityarea pop pop25 college college25 zipcollegep zipcollegep25
save  "collegep111.dta", replace
restore

**merge zip code college students ratio
sort cityarea
merge m:1 cityarea using "collegep111.dta"  
drop if _merge==2
drop _merge
label var zipcollegep "% college gradautes in zipcode"
label var zipcollegep25 "% of college gradautes aged 25-29 in zipcode"

save "ntu_2023.dta", replace


*****************combine two years 
***read 2023 data
use "ntu_2023.dta", clear
gen year = 112
destring 郵遞區號  書審資料EP頁數, force replace
drop EPPDF

***append ntu_new.dta
append using  "ntu_2022.dta"

recode year (.=111)
drop 性別 性別代碼 類別 分發外加名額 本校程式代碼  

replace 單價平均 = 單價平均_萬 if year == 111
replace 單價標準差 = 單價標準差_萬 if year == 111
replace 單價中位數 = 單價標準差_萬 if year == 111
replace 單價最小值 = 單價最小值_萬 if year == 111
replace 單價最大值 = 單價最大值_萬 if year == 111
replace 總價平均   = 總價平均_萬 if year == 111
replace 建坪平均 = 建坪平均_坪 if year == 111
replace 地坪平均 = 地坪平均_坪 if year == 111
replace 屋齡平均  = 屋齡平均_年 if year == 111

drop  單價平均_萬  - 屋齡平均_年
replace zipcode = 戶籍郵遞  if year == 111
replace zipcode = int(zipcode/100) if zipcode >10000 & zipcode <.

**redefine dept 
drop dept
label drop dept_l
do "create department code.do"


**進入二階者沒有同級分篩選科目的資訊，用沒進入二階者的資訊填補
preserve
drop if dsource == 3
collapse (first)同級分超額篩選科目 , by(dept year)
sort dept
save "compare_subjects.dta", replace
restore

merge m:1 dept year using "compare_subjects.dta", update
replace 同級分超額篩選科目 = "英數A自" if 同級分超額篩選科目 =="" /*資工系 (APCS)*/

drop _merge

*calculate the rank1 scores if the scores are tied
gen rank1 = chinese + english if 同級分超額篩選科目 == "國英"
replace rank1 = chinese + english + mathA if 同級分超額篩選科目 == "國英數A"
replace rank1 = chinese + english + mathA + social if 同級分超額篩選科目 == "國英數A社"
replace rank1 = chinese + english + mathA + natural if 同級分超額篩選科目 == "國英數A自"
replace rank1 = chinese + english + mathB + social if 同級分超額篩選科目 == "國英數B社"
replace rank1 = chinese + english + social if 同級分超額篩選科目 == "國英社"
replace rank1 = english + mathA + social if 同級分超額篩選科目 == "英數A社"
replace rank1 = english + mathA + social + natural if 同級分超額篩選科目 == "英數A社自"
replace rank1 = english + mathA + natural if 同級分超額篩選科目 == "英數A自"

**calculate rank2 to rank4, the ranking criteria can be found in applicantion brosure.pdf 
do "rank1234.do"
**create weighted GSAT from the raw score, the weights can be found in application brosure.pdf
do "GSAT.do"
label var gsat "weighted GSAT score"

gen rank5=chinese /*add chinese composition as rank5*/

recode chinese - natural (-3.5/-1=.)
egen rank6= rowtotal(chinese english mathA mathB social natural) /*add total test score as rank6*/
save  "NTU2022_23.dta", replace





